#loading the required libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px
#loading the data sets by pandas
df_ipl = pd.read_csv('matches.csv')
#checking the top 5 records
df_ipl.head()
| id | season | city | date | team1 | team2 | toss_winner | toss_decision | result | dl_applied | winner | win_by_runs | win_by_wickets | player_of_match | venue | umpire1 | umpire2 | umpire3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017 | Hyderabad | 05-04-2017 | Sunrisers Hyderabad | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | 0 | Sunrisers Hyderabad | 35 | 0 | Yuvraj Singh | Rajiv Gandhi International Stadium, Uppal | AY Dandekar | NJ Llong | NaN |
| 1 | 2 | 2017 | Pune | 06-04-2017 | Mumbai Indians | Rising Pune Supergiant | Rising Pune Supergiant | field | normal | 0 | Rising Pune Supergiant | 0 | 7 | SPD Smith | Maharashtra Cricket Association Stadium | A Nand Kishore | S Ravi | NaN |
| 2 | 3 | 2017 | Rajkot | 07-04-2017 | Gujarat Lions | Kolkata Knight Riders | Kolkata Knight Riders | field | normal | 0 | Kolkata Knight Riders | 0 | 10 | CA Lynn | Saurashtra Cricket Association Stadium | Nitin Menon | CK Nandan | NaN |
| 3 | 4 | 2017 | Indore | 08-04-2017 | Rising Pune Supergiant | Kings XI Punjab | Kings XI Punjab | field | normal | 0 | Kings XI Punjab | 0 | 6 | GJ Maxwell | Holkar Cricket Stadium | AK Chaudhary | C Shamshuddin | NaN |
| 4 | 5 | 2017 | Bangalore | 08-04-2017 | Royal Challengers Bangalore | Delhi Daredevils | Royal Challengers Bangalore | bat | normal | 0 | Royal Challengers Bangalore | 15 | 0 | KM Jadhav | M Chinnaswamy Stadium | NaN | NaN | NaN |
#looking the number of rows and columns in the data set
df_ipl.shape
(756, 18)
#Getting the frequency of most man of the match awards and ploting it
manofthematch=df_ipl['player_of_match'].value_counts().head(10)
fig=px.bar(manofthematch, y='player_of_match', color='player_of_match')
fig.show(renderer='notebook')
df_ipl.head(2)
| id | season | city | date | team1 | team2 | toss_winner | toss_decision | result | dl_applied | winner | win_by_runs | win_by_wickets | player_of_match | venue | umpire1 | umpire2 | umpire3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017 | Hyderabad | 05-04-2017 | Sunrisers Hyderabad | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | 0 | Sunrisers Hyderabad | 35 | 0 | Yuvraj Singh | Rajiv Gandhi International Stadium, Uppal | AY Dandekar | NJ Llong | NaN |
| 1 | 2 | 2017 | Pune | 06-04-2017 | Mumbai Indians | Rising Pune Supergiant | Rising Pune Supergiant | field | normal | 0 | Rising Pune Supergiant | 0 | 7 | SPD Smith | Maharashtra Cricket Association Stadium | A Nand Kishore | S Ravi | NaN |
#checking the various types of the results
resultplot=df_ipl['result'].value_counts()
px.bar(y=resultplot.index,x=resultplot.values,color=resultplot.index, color_discrete_sequence=px.colors.qualitative.Plotly)
df_ipl.head(2)
| id | season | city | date | team1 | team2 | toss_winner | toss_decision | result | dl_applied | winner | win_by_runs | win_by_wickets | player_of_match | venue | umpire1 | umpire2 | umpire3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017 | Hyderabad | 05-04-2017 | Sunrisers Hyderabad | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | 0 | Sunrisers Hyderabad | 35 | 0 | Yuvraj Singh | Rajiv Gandhi International Stadium, Uppal | AY Dandekar | NJ Llong | NaN |
| 1 | 2 | 2017 | Pune | 06-04-2017 | Mumbai Indians | Rising Pune Supergiant | Rising Pune Supergiant | field | normal | 0 | Rising Pune Supergiant | 0 | 7 | SPD Smith | Maharashtra Cricket Association Stadium | A Nand Kishore | S Ravi | NaN |
#checking the null values
df_ipl.isnull().sum().sort_values(ascending=False)
umpire3 637 city 7 winner 4 player_of_match 4 umpire2 2 umpire1 2 venue 0 win_by_wickets 0 win_by_runs 0 id 0 season 0 result 0 toss_decision 0 toss_winner 0 team2 0 team1 0 date 0 dl_applied 0 dtype: int64
#checking the persentage to take a deision
(df_ipl.isnull().mean()*100).round(2).sort_values(ascending = False)
umpire3 84.26 city 0.93 winner 0.53 player_of_match 0.53 umpire2 0.26 umpire1 0.26 venue 0.00 win_by_wickets 0.00 win_by_runs 0.00 id 0.00 season 0.00 result 0.00 toss_decision 0.00 toss_winner 0.00 team2 0.00 team1 0.00 date 0.00 dl_applied 0.00 dtype: float64
#droping unwanted columns
df_ipl = df_ipl.drop(columns=['umpire3'])
df_ipl.head(2)
| id | season | city | date | team1 | team2 | toss_winner | toss_decision | result | dl_applied | winner | win_by_runs | win_by_wickets | player_of_match | venue | umpire1 | umpire2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2017 | Hyderabad | 05-04-2017 | Sunrisers Hyderabad | Royal Challengers Bangalore | Royal Challengers Bangalore | field | normal | 0 | Sunrisers Hyderabad | 35 | 0 | Yuvraj Singh | Rajiv Gandhi International Stadium, Uppal | AY Dandekar | NJ Llong |
| 1 | 2 | 2017 | Pune | 06-04-2017 | Mumbai Indians | Rising Pune Supergiant | Rising Pune Supergiant | field | normal | 0 | Rising Pune Supergiant | 0 | 7 | SPD Smith | Maharashtra Cricket Association Stadium | A Nand Kishore | S Ravi |
#treating missing values
df_ipl['city'].replace(np.NaN,'Unknown',inplace=True)
df_ipl['winner'].replace(np.NaN,'Unknown',inplace=True)
df_ipl['player_of_match'].replace(np.NaN,'Unknown',inplace=True)
mode_value = df_ipl["umpire2"].mode()[0]
df_ipl['umpire2'].fillna(mode_value,inplace=True)
mode_value1 = df_ipl["umpire1"].mode()[0]
df_ipl['umpire1'].fillna(mode_value1,inplace=True)
df_ipl.isnull().sum().sort_values(ascending=False)
player_of_match 4 id 0 dl_applied 0 umpire1 0 venue 0 win_by_wickets 0 win_by_runs 0 winner 0 result 0 season 0 toss_decision 0 toss_winner 0 team2 0 team1 0 date 0 city 0 umpire2 0 dtype: int64
df_ipl[df_ipl['player_of_match'].isnull()]
| id | season | city | date | team1 | team2 | toss_winner | toss_decision | result | dl_applied | winner | win_by_runs | win_by_wickets | player_of_match | venue | umpire1 | umpire2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 300 | 301 | 2011 | Delhi | 21-05-2011 | Delhi Daredevils | Pune Warriors | Delhi Daredevils | bat | no result | 0 | Unknown | 0 | 0 | NaN | Feroz Shah Kotla | SS Hazare | RJ Tucker |
| 545 | 546 | 2015 | Bangalore | 29-04-2015 | Royal Challengers Bangalore | Rajasthan Royals | Rajasthan Royals | field | no result | 0 | Unknown | 0 | 0 | NaN | M Chinnaswamy Stadium | JD Cloete | PG Pathak |
| 570 | 571 | 2015 | Bangalore | 17-05-2015 | Delhi Daredevils | Royal Challengers Bangalore | Royal Challengers Bangalore | field | no result | 0 | Unknown | 0 | 0 | NaN | M Chinnaswamy Stadium | HDPK Dharmasena | K Srinivasan |
| 744 | 11340 | 2019 | Bengaluru | 30-04-2019 | Royal Challengers Bangalore | Rajasthan Royals | Rajasthan Royals | field | no result | 0 | Unknown | 0 | 0 | NaN | M. Chinnaswamy Stadium | Nigel Llong | Ulhas Gandhe |
#checking the data and datatypes of the columns
df_ipl.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 756 entries, 0 to 755 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 756 non-null int64 1 season 756 non-null int64 2 city 756 non-null object 3 date 756 non-null object 4 team1 756 non-null object 5 team2 756 non-null object 6 toss_winner 756 non-null object 7 toss_decision 756 non-null object 8 result 756 non-null object 9 dl_applied 756 non-null int64 10 winner 756 non-null object 11 win_by_runs 756 non-null int64 12 win_by_wickets 756 non-null int64 13 player_of_match 752 non-null category 14 venue 756 non-null object 15 umpire1 756 non-null object 16 umpire2 756 non-null object dtypes: category(1), int64(5), object(11) memory usage: 106.0+ KB
#changing the data type of the columns
df_ipl['player_of_match']=df_ipl['player_of_match'].astype('object')
df_ipl.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 756 entries, 0 to 755 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 756 non-null int64 1 season 756 non-null int64 2 city 756 non-null object 3 date 756 non-null object 4 team1 756 non-null object 5 team2 756 non-null object 6 toss_winner 756 non-null object 7 toss_decision 756 non-null object 8 result 756 non-null object 9 dl_applied 756 non-null int64 10 winner 756 non-null object 11 win_by_runs 756 non-null int64 12 win_by_wickets 756 non-null int64 13 player_of_match 752 non-null object 14 venue 756 non-null object 15 umpire1 756 non-null object 16 umpire2 756 non-null object dtypes: int64(5), object(12) memory usage: 100.5+ KB
#making sure the we have not missed to treat any null values
df_ipl.isnull().sum().sort_values(ascending=False)
id 0 dl_applied 0 umpire1 0 venue 0 player_of_match 0 win_by_wickets 0 win_by_runs 0 winner 0 result 0 season 0 toss_decision 0 toss_winner 0 team2 0 team1 0 date 0 city 0 umpire2 0 dtype: int64
#Checking which team won the toss most times
df_ipl["toss_winner"].value_counts().head()
Mumbai Indians 98 Kolkata Knight Riders 92 Chennai Super Kings 89 Royal Challengers Bangalore 81 Kings XI Punjab 81 Name: toss_winner, dtype: int64
#checking which team bat first and won the match with the plot below
batingfirst = df_ipl[df_ipl['win_by_runs']!=0]
fig = px.histogram(batingfirst['win_by_runs'])
fig.update_layout(title_text='Distribution by Runs')
fig.update_xaxes(title_text='Runs')
fig.update_yaxes(title_text='Match_count')
#Finding out the number of wins each team after batting first
batingfirst["winner"].value_counts()
Mumbai Indians 57 Chennai Super Kings 52 Kings XI Punjab 38 Kolkata Knight Riders 36 Royal Challengers Bangalore 35 Sunrisers Hyderabad 30 Rajasthan Royals 27 Delhi Daredevils 25 Deccan Chargers 18 Pune Warriors 6 Rising Pune Supergiant 5 Delhi Capitals 3 Kochi Tuskers Kerala 2 Rising Pune Supergiants 2 Gujarat Lions 1 Name: winner, dtype: int64
#checking the second bating and who wont the match most by wickets with plots
secondbating = df_ipl[df_ipl['win_by_wickets']!=0]
fig = px.histogram(secondbating['win_by_wickets'])
fig.update_layout(title_text='Distribution by Runs')
fig.update_xaxes(title_text='Wikets')
fig.update_yaxes(title_text='Match_count')
#secondbated and wont the most match
secondbating['winner'].value_counts()
Kolkata Knight Riders 56 Mumbai Indians 50 Royal Challengers Bangalore 48 Chennai Super Kings 48 Rajasthan Royals 46 Kings XI Punjab 42 Delhi Daredevils 42 Sunrisers Hyderabad 27 Gujarat Lions 12 Deccan Chargers 11 Pune Warriors 6 Delhi Capitals 6 Rising Pune Supergiant 5 Kochi Tuskers Kerala 4 Rising Pune Supergiants 3 Name: winner, dtype: int64
#looking at the number of the matches played each season
df_ipl['season'].value_counts()
2013 76 2012 74 2011 73 2010 60 2014 60 2016 60 2018 60 2019 60 2017 59 2015 59 2008 58 2009 57 Name: season, dtype: int64
#looking at the number of the matches played each city
df_ipl['city'].value_counts()
Mumbai 101 Kolkata 77 Delhi 74 Bangalore 66 Hyderabad 64 Chennai 57 Jaipur 47 Chandigarh 46 Pune 38 Durban 15 Bengaluru 14 Visakhapatnam 13 Ahmedabad 12 Centurion 12 Rajkot 10 Mohali 10 Indore 9 Dharamsala 9 Johannesburg 8 Cape Town 7 Ranchi 7 Abu Dhabi 7 Cuttack 7 Unknown 7 Port Elizabeth 7 Sharjah 6 Raipur 6 Kochi 5 Kanpur 4 Nagpur 3 Kimberley 3 East London 3 Bloemfontein 2 Name: city, dtype: int64
#Checking the number of team those who won the toss also and match also
np.sum(df_ipl['toss_winner']==df_ipl['winner'])
393
#As it is proved toss dose not played a main role to win the match
result = 393/756
percentage = result * 100
print(percentage)
51.98412698412699
#importing other data set
deliveries = pd.read_csv("deliveries.csv")
#seeing top 5 records
deliveries.head()
| match_id | inning | batting_team | bowling_team | over | ball | batsman | non_striker | bowler | is_super_over | ... | bye_runs | legbye_runs | noball_runs | penalty_runs | batsman_runs | extra_runs | total_runs | player_dismissed | dismissal_kind | fielder | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 1 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN |
| 1 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 2 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN |
| 2 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 3 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 4 | 0 | 4 | NaN | NaN | NaN |
| 3 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 4 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN |
| 4 | 1 | 1 | Sunrisers Hyderabad | Royal Challengers Bangalore | 1 | 5 | DA Warner | S Dhawan | TS Mills | 0 | ... | 0 | 0 | 0 | 0 | 0 | 2 | 2 | NaN | NaN | NaN |
5 rows × 21 columns
#checking the rows and columns
deliveries.shape
(179078, 21)
#flitering the required details
srh = deliveries[(deliveries['match_id']==1) & (deliveries['inning']==1)]
#after filtering checking the SRH team team score
srh['batsman_runs'].value_counts()
1 57 0 32 4 17 6 9 2 9 3 1 Name: batsman_runs, dtype: int64
#checking how SRH team bats man got out
srh['dismissal_kind'].value_counts()
caught 3 bowled 1 Name: dismissal_kind, dtype: int64
#in the same way wt we used above checking the other teams score
RCB = deliveries[(deliveries['match_id']==1) & (deliveries['inning']==2)]
RCB['batsman_runs'].value_counts()
0 49 1 44 4 15 6 8 2 7 Name: batsman_runs, dtype: int64
# checking RCB team dismissals
RCB['dismissal_kind'].value_counts()
caught 6 bowled 2 run out 2 Name: dismissal_kind, dtype: int64